Multivariate data visualization

MACS 40700
University of Chicago

April 19, 2017

Working with dates in R

  • Numeric (20174018)
  • String (2016-04-18, April 18th, 2017, etc.)
  • Split into components
  • lubridate

Formatting dates

library(lubridate)
ymd("2017-01-31")
## [1] "2017-01-31"
mdy("January 31st, 2017")
## [1] "2017-01-31"
dmy("31-Jan-2017")
## [1] "2017-01-31"

Extracting date components

(flights <- read_csv("data/flights-departed.csv"))
## # A tibble: 7,671 × 2
##          date value
##        <date> <int>
## 1  1988-01-01 12681
## 2  1988-01-02 13264
## 3  1988-01-03 13953
## 4  1988-01-04 13921
## 5  1988-01-05 13932
## 6  1988-01-06 13157
## 7  1988-01-07 11159
## 8  1988-01-08 11631
## 9  1988-01-09 12045
## 10 1988-01-10 13160
## # ... with 7,661 more rows
ggplot(flights, aes(date, value)) +
  geom_line() +
  labs(x = NULL,
       y = "Number of departing commercial flights")

Extracting date components

(flights <- flights %>%
  mutate(year = year(date),
         yday = yday(date),
         # hack to label the x-axis with months
         days = dmy(format(date,"%d-%m-2016"))))
## # A tibble: 7,671 × 5
##          date value  year  yday       days
##        <date> <int> <dbl> <dbl>     <date>
## 1  1988-01-01 12681  1988     1 2016-01-01
## 2  1988-01-02 13264  1988     2 2016-01-02
## 3  1988-01-03 13953  1988     3 2016-01-03
## 4  1988-01-04 13921  1988     4 2016-01-04
## 5  1988-01-05 13932  1988     5 2016-01-05
## 6  1988-01-06 13157  1988     6 2016-01-06
## 7  1988-01-07 11159  1988     7 2016-01-07
## 8  1988-01-08 11631  1988     8 2016-01-08
## 9  1988-01-09 12045  1988     9 2016-01-09
## 10 1988-01-10 13160  1988    10 2016-01-10
## # ... with 7,661 more rows
ggplot(flights, aes(days, value)) +
  geom_line(aes(group = year), alpha = .2) +
  geom_smooth(se = FALSE) +
  scale_x_date(labels = scales::date_format("%b")) +
  labs(x = NULL,
       y = "Number of departing commercial flights")

Extracting date components

(flights <- flights %>%
  mutate(month = month(date, label = TRUE)))
## # A tibble: 7,671 × 6
##          date value  year  yday       days month
##        <date> <int> <dbl> <dbl>     <date> <ord>
## 1  1988-01-01 12681  1988     1 2016-01-01   Jan
## 2  1988-01-02 13264  1988     2 2016-01-02   Jan
## 3  1988-01-03 13953  1988     3 2016-01-03   Jan
## 4  1988-01-04 13921  1988     4 2016-01-04   Jan
## 5  1988-01-05 13932  1988     5 2016-01-05   Jan
## 6  1988-01-06 13157  1988     6 2016-01-06   Jan
## 7  1988-01-07 11159  1988     7 2016-01-07   Jan
## 8  1988-01-08 11631  1988     8 2016-01-08   Jan
## 9  1988-01-09 12045  1988     9 2016-01-09   Jan
## 10 1988-01-10 13160  1988    10 2016-01-10   Jan
## # ... with 7,661 more rows
ggplot(flights, aes(month, value)) +
  geom_violin() +
  geom_boxplot(width = .1, outlier.shape = NA) +
  labs(x = NULL,
       y = "Number of departing commercial flights")

Calendar heatmap

  • Layer
    • Data - flights
    • Mapping
      • \(x\) - weekday (e.g. Sunday, Monday, Tuesday)
      • \(y\) - week in month (e.g. first week, second week, third week)
      • Fill - value (number of departing flights)
    • Statistical transformation (stat) - identity
    • Geometric object (geom) - geom_tile()
    • Position adjustment (position) - none
  • Scale
    • Fill - low and high-end colors (use shading to identify in-between values)
  • Coordinate system - Cartesian coordinate plane
  • Faceting - facet_grid() (year X month)

Calendar heatmap

  • Year
  • Month
  • Weekday
  • Week-in-month

Calendar heatmap

(flights <- flights %>%
  mutate(weekday = wday(date, label = TRUE)))
## # A tibble: 7,671 × 7
##          date value  year  yday       days month weekday
##        <date> <int> <dbl> <dbl>     <date> <ord>   <ord>
## 1  1988-01-01 12681  1988     1 2016-01-01   Jan     Fri
## 2  1988-01-02 13264  1988     2 2016-01-02   Jan     Sat
## 3  1988-01-03 13953  1988     3 2016-01-03   Jan     Sun
## 4  1988-01-04 13921  1988     4 2016-01-04   Jan     Mon
## 5  1988-01-05 13932  1988     5 2016-01-05   Jan    Tues
## 6  1988-01-06 13157  1988     6 2016-01-06   Jan     Wed
## 7  1988-01-07 11159  1988     7 2016-01-07   Jan   Thurs
## 8  1988-01-08 11631  1988     8 2016-01-08   Jan     Fri
## 9  1988-01-09 12045  1988     9 2016-01-09   Jan     Sat
## 10 1988-01-10 13160  1988    10 2016-01-10   Jan     Sun
## # ... with 7,661 more rows
(flights <- flights %>%
  # generate variables for week in the year (1-54) and the day in the year (1-366)
  mutate(week = week(date),
         yday = yday(date)) %>%
  # normalize to draw calendar correctly - wday should represent the number of days from the Sunday of the week containing January 1st, then adjust based on that
  group_by(year) %>%
  mutate(yday = yday + wday(date)[1] - 2,
         week = floor(yday / 7)) %>%
  group_by(year, month) %>%
  mutate(week_month = week - min(week) + 1))
## Source: local data frame [7,671 x 9]
## Groups: year, month [252]
## 
##          date value  year  yday       days month weekday  week week_month
##        <date> <int> <dbl> <dbl>     <date> <ord>   <ord> <dbl>      <dbl>
## 1  1988-01-01 12681  1988     5 2016-01-01   Jan     Fri     0          1
## 2  1988-01-02 13264  1988     6 2016-01-02   Jan     Sat     0          1
## 3  1988-01-03 13953  1988     7 2016-01-03   Jan     Sun     1          2
## 4  1988-01-04 13921  1988     8 2016-01-04   Jan     Mon     1          2
## 5  1988-01-05 13932  1988     9 2016-01-05   Jan    Tues     1          2
## 6  1988-01-06 13157  1988    10 2016-01-06   Jan     Wed     1          2
## 7  1988-01-07 11159  1988    11 2016-01-07   Jan   Thurs     1          2
## 8  1988-01-08 11631  1988    12 2016-01-08   Jan     Fri     1          2
## 9  1988-01-09 12045  1988    13 2016-01-09   Jan     Sat     1          2
## 10 1988-01-10 13160  1988    14 2016-01-10   Jan     Sun     2          3
## # ... with 7,661 more rows

ggplot(flights, aes(weekday, week_month, fill = value)) +
  facet_grid(year ~ month) +
  geom_tile(color = "black") +
  scale_fill_continuous(low = "green", high = "red") +
  scale_x_discrete(labels = NULL) +
  scale_y_reverse(labels = NULL) +
  labs(title = "Domestic commercial flight activity",
       x = NULL,
       y = NULL,
       fill = "Number of departing flights") +
  theme_void() +
  theme(legend.position = "bottom",
        legend.text = element_text(angle = 45))

Smoothing lines

p +
  geom_smooth(method = "lm", se = FALSE)

p +
  geom_smooth(se = FALSE)

LOESS

LOESS

LOESS

Coefficient of correlation (\(r\))

  • Pearson’s \(r\)
  • Scales between \(-1\) and \(+1\)
    • \(-1\) – perfect negative association between the variables
    • \(+1\) – perfect positive association between the variables
    • \(0\) – no relationship between the variables
  • Unit-less measure

Coefficient of correlation (\(r\))

r_plot <- function(r, n = 100){
  xy <- ecodist::corgen(len = n, r = r) %>%
    bind_cols
  
  ggplot(xy, aes(x, y)) +
    geom_point() +
    ggtitle(str_c("Pearson's r = ", r))
}

r <- c(.8, 0, -.8)

for(r in r){
  print(r_plot(r))
}

Scatterplot matrix

pairs(select_if(credit, is.numeric))

Scatterplot matrix

library(GGally)

ggpairs(select_if(credit, is.numeric))

Scatterplot matrix

ggpairs(credit, mapping = aes(color = gender),
        columns = c("income", "limit", "rating", "cards", "age", "education", "balance"))

Scatterplot matrix

ggpairs(select_if(credit, is.numeric),
        lower = list(
          continuous = "smooth"
        )
)

Scatterplot matrix

ggpairs(select_if(credit, is.numeric),
        lower = list(
          continuous = wrap("smooth", alpha = .1, color = "blue")
        )
)

Scatterplot matrix

scatter_smooth <- function(data, mapping, ...) {
  ggplot(data = data, mapping = mapping) +
    # make data points transparent
    geom_point(alpha = .2) +
    # add default smoother
    geom_smooth(se = FALSE)
}

ggpairs(select_if(credit, is.numeric),
        lower = list(
          continuous = scatter_smooth
        )
)

ggpairs(credit, mapping = aes(color = gender),
        columns = c("income", "limit", "rating", "cards", "age", "education", "balance"),
        lower = list(
          continuous = scatter_smooth
        )
)

Scatterplot matrix

ggpairs(select(rcfss::scorecard, type:debt))

Heatmap of correlation coefficients

(mpg_lite <- select_if(mpg, is.numeric))
## # A tibble: 234 × 5
##    displ  year   cyl   cty   hwy
##    <dbl> <int> <int> <int> <int>
## 1    1.8  1999     4    18    29
## 2    1.8  1999     4    21    29
## 3    2.0  2008     4    20    31
## 4    2.0  2008     4    21    30
## 5    2.8  1999     6    16    26
## 6    2.8  1999     6    18    26
## 7    3.1  2008     6    18    27
## 8    1.8  1999     4    18    26
## 9    1.8  1999     4    16    25
## 10   2.0  2008     4    20    28
## # ... with 224 more rows
(cormat <- mpg_lite %>%
  cor %>%
  round(2))
##       displ  year   cyl   cty   hwy
## displ  1.00  0.15  0.93 -0.80 -0.77
## year   0.15  1.00  0.12 -0.04  0.00
## cyl    0.93  0.12  1.00 -0.81 -0.76
## cty   -0.80 -0.04 -0.81  1.00  0.96
## hwy   -0.77  0.00 -0.76  0.96  1.00
library(reshape2)
(melted_cormat <- melt(cormat))
##     Var1  Var2 value
## 1  displ displ  1.00
## 2   year displ  0.15
## 3    cyl displ  0.93
## 4    cty displ -0.80
## 5    hwy displ -0.77
## 6  displ  year  0.15
## 7   year  year  1.00
## 8    cyl  year  0.12
## 9    cty  year -0.04
## 10   hwy  year  0.00
## 11 displ   cyl  0.93
## 12  year   cyl  0.12
## 13   cyl   cyl  1.00
## 14   cty   cyl -0.81
## 15   hwy   cyl -0.76
## 16 displ   cty -0.80
## 17  year   cty -0.04
## 18   cyl   cty -0.81
## 19   cty   cty  1.00
## 20   hwy   cty  0.96
## 21 displ   hwy -0.77
## 22  year   hwy  0.00
## 23   cyl   hwy -0.76
## 24   cty   hwy  0.96
## 25   hwy   hwy  1.00
ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) + 
  geom_tile()

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
  cormat[upper.tri(cormat)] <- NA
  return(cormat)
}

# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)]<- NA
  return(cormat)
}

upper_tri <- get_upper_tri(cormat)
upper_tri
##       displ year  cyl   cty   hwy
## displ     1 0.15 0.93 -0.80 -0.77
## year     NA 1.00 0.12 -0.04  0.00
## cyl      NA   NA 1.00 -0.81 -0.76
## cty      NA   NA   NA  1.00  0.96
## hwy      NA   NA   NA    NA  1.00
melted_cormat <- melt(upper_tri, na.rm = TRUE)

ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white") +
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1)) +
 coord_fixed()

reorder_cormat <- function(cormat){
  # Use correlation between variables as distance
  dd <- as.dist((1-cormat)/2)
  hc <- hclust(dd)
  cormat <-cormat[hc$order, hc$order]
}

# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)

# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)

# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()

# Print the heatmap
print(ggheatmap)

ggheatmap + 
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "bottom")

cormat_heatmap <- function(data){
  # generate correlation matrix
  cormat <- round(cor(data), 2)
  
  # melt into a tidy table
  get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
  }
  
  upper_tri <- get_upper_tri(cormat)
  
  # reorder matrix based on coefficient value
  reorder_cormat <- function(cormat){
    # Use correlation between variables as distance
    dd <- as.dist((1-cormat)/2)
    hc <- hclust(dd)
    cormat <-cormat[hc$order, hc$order]
  }
  
  cormat <- reorder_cormat(cormat)
  upper_tri <- get_upper_tri(cormat)
  
  # Melt the correlation matrix
  melted_cormat <- melt(upper_tri, na.rm = TRUE)
  
  # Create a ggheatmap
  ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
    geom_tile(color = "white")+
    scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                         midpoint = 0, limit = c(-1,1), space = "Lab", 
                         name="Pearson\nCorrelation") +
    theme_minimal()+ # minimal theme
    theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                     size = 12, hjust = 1))+
    coord_fixed()
  
  # add correlation values to graph
  ggheatmap + 
    geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      panel.grid.major = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank(),
      axis.ticks = element_blank(),
      legend.position = "bottom")
}

cormat_heatmap(select_if(mpg, is.numeric))

cormat_heatmap(select_if(credit, is.numeric))

cormat_heatmap(select_if(diamonds, is.numeric))

Parallel coordinate plots

ggparcoord(data = iris, columns = 1:4, groupColumn = 5)

# with the iris data, order the axes by overall class (Species) separation
# using the anyClass option
ggparcoord(data = iris, columns = 1:4, groupColumn = 5, order = "anyClass")

# add points to the plot, add a title, and use an alpha scalar to make the
# lines transparent
p <- ggparcoord(data = iris, columns = 1:4, groupColumn = 5, order = "anyClass", 
    showPoints = TRUE, title = "Parallel Coordinate Plot for the Iris Data", 
    alphaLines = 0.3)
p

# add some basic interactivity
ggplotly(p)

Three dimensions

Three dimensions

## # A tibble: 1,317 × 5
##    vote96   age  educ female mhealth
##     <dbl> <dbl> <dbl>  <dbl>   <dbl>
## 1       1    60    12      0       0
## 2       1    36    12      0       1
## 3       0    21    13      0       7
## 4       0    29    13      0       6
## 5       1    39    18      1       2
## 6       1    41    15      1       1
## 7       1    48    20      0       2
## 8       0    20    12      1       9
## 9       0    27    11      1       9
## 10      0    34     7      1       2
## # ... with 1,307 more rows
##          term estimate std.error statistic  p.value
## 1 (Intercept)  -5.0244   0.44482     -11.3 1.38e-29
## 2         age   0.0469   0.00441      10.6 1.94e-26
## 3        educ   0.2816   0.02629      10.7 9.32e-27

3D scatter plot with mesh

plot_ly(vote_prob, x = ~age, y = ~educ, z = ~prob) %>%
  add_mesh()

3D scatter plot with mesh

plot_ly(credit, x = ~limit, y = ~balance, z = ~income) %>%
  add_mesh()

3D surface plot

plot_ly(z = ~volcano) %>% add_surface()
volcano %>%
  melt %>%
  ggplot(aes(Var1, Var2, z = value)) +
  geom_contour(aes(color = ..level..))